library(ggplot2)
library(nortest)
# Load the movies dataset from its CSV file and list the column names.
movies <- read.csv(file = "../datos/movies_2026.csv")
names(movies)
[1] "id" "budget"
[3] "genres" "homePage"
[5] "productionCompany" "productionCompanyCountry"
[7] "productionCountry" "revenue"
[9] "runtime" "video"
[11] "director" "actors"
[13] "actorsPopularity" "actorsCharacter"
[15] "originalTitle" "title"
[17] "originalLanguage" "popularity"
[19] "releaseDate" "voteAvg"
[21] "voteCount" "genresAmount"
[23] "productionCoAmount" "productionCountriesAmount"
[25] "actorsAmount" "castWomenAmount"
[27] "castMenAmount" "releaseYear"
# Names of the columns treated as numeric in the analysis below.
numericas <- c(
  "budget", "revenue", "runtime",
  "genresAmount", "productionCoAmount", "productionCountriesAmount",
  "actorsAmount", "castWomenAmount", "castMenAmount",
  "releaseYear", "popularity", "voteCount", "voteAvg"
)
# Keep only those columns, in the order listed above.
movies_num <- movies[numericas]
names(movies_num)
[1] "budget" "revenue"
[3] "runtime" "genresAmount"
[5] "productionCoAmount" "productionCountriesAmount"
[7] "actorsAmount" "castWomenAmount"
[9] "castMenAmount" "releaseYear"
[11] "popularity" "voteCount"
[13] "voteAvg"
# Keep every column that is neither listed in `numericas` nor the "id" column.
movies_cat <- movies[, setdiff(names(movies), c(numericas, "id"))]
colnames(movies_cat)
[1] "genres" "homePage" "productionCompany"
[4] "productionCompanyCountry" "productionCountry" "video"
[7] "director" "actors" "actorsPopularity"
[10] "actorsCharacter" "originalTitle" "title"
[13] "originalLanguage" "releaseDate"
El conjunto de datos está formado por 13 variables cuantitativas y 14 variables cualitativas.
# Summary statistics (min, quartiles, mean, max and NA counts) for each numeric column.
# NOTE(review): the printed maxima for actorsAmount, castWomenAmount and castMenAmount
# (around 920000) look implausible for cast sizes; check those rows for corrupted values.
summary(movies_num)
budget revenue runtime genresAmount
Min. : 0 Min. :0.000e+00 Min. : 0.00 Min. : 0.000
1st Qu.: 0 1st Qu.:0.000e+00 1st Qu.: 10.00 1st Qu.: 1.000
Median : 0 Median :0.000e+00 Median : 86.00 Median : 2.000
Mean : 9413280 Mean :2.879e+07 Mean : 66.09 Mean : 1.949
3rd Qu.: 1000000 3rd Qu.:3.306e+05 3rd Qu.:103.00 3rd Qu.: 3.000
Max. :380000000 Max. :2.847e+09 Max. :750.00 Max. :16.000
productionCoAmount productionCountriesAmount actorsAmount castWomenAmount
Min. : 0.000 Min. : 0.00 Min. : 0 Min. : 0
1st Qu.: 0.000 1st Qu.: 1.00 1st Qu.: 3 1st Qu.: 0
Median : 1.000 Median : 1.00 Median : 9 Median : 2
Mean : 1.973 Mean : 1.23 Mean : 1082 Mean : 3517
3rd Qu.: 3.000 3rd Qu.: 1.00 3rd Qu.: 21 3rd Qu.: 6
Max. :89.000 Max. :155.00 Max. :919590 Max. :922162
NA's :37
castMenAmount releaseYear popularity voteCount
Min. : 0 Min. :1902 Min. :0.000e+00 Min. : 0.0
1st Qu.: 0 1st Qu.:2013 1st Qu.:5.460e-02 1st Qu.: 0.0
Median : 3 Median :2021 Median :8.502e+00 Median : 6.0
Mean : 8224 Mean :2017 Mean :2.625e+01 Mean : 675.9
3rd Qu.: 12 3rd Qu.:2025 3rd Qu.:2.224e+01 3rd Qu.: 423.0
Max. :922017 Max. :2026 Max. :1.147e+04 Max. :30788.0
NA's :162 NA's :2
voteAvg
Min. : 0.000
1st Qu.: 0.000
Median : 5.400
Mean : 3.837
3rd Qu.: 6.800
Max. :10.000
# budget: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$budget)
boxplot(movies_num$budget)
lillie.test(movies_num$budget)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$budget
D = 0.37204, p-value < 2.2e-16
La variable “budget” no sigue una distribución normal.
# revenue: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$revenue)
boxplot(movies_num$revenue)
lillie.test(movies_num$revenue)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$revenue
D = 0.39765, p-value < 2.2e-16
# runtime: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$runtime)
boxplot(movies_num$runtime)
lillie.test(movies_num$runtime)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$runtime
D = 0.1629, p-value < 2.2e-16
# genresAmount: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$genresAmount)
boxplot(movies_num$genresAmount)
lillie.test(movies_num$genresAmount)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$genresAmount
D = 0.19266, p-value < 2.2e-16
# productionCoAmount: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$productionCoAmount)
boxplot(movies_num$productionCoAmount)
lillie.test(movies_num$productionCoAmount)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$productionCoAmount
D = 0.20818, p-value < 2.2e-16
# productionCountriesAmount: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$productionCountriesAmount)
boxplot(movies_num$productionCountriesAmount)
lillie.test(movies_num$productionCountriesAmount)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$productionCountriesAmount
D = 0.3698, p-value < 2.2e-16
# actorsAmount: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$actorsAmount)
boxplot(movies_num$actorsAmount)
lillie.test(movies_num$actorsAmount)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$actorsAmount
D = 0.51124, p-value < 2.2e-16
# castWomenAmount: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$castWomenAmount)
boxplot(movies_num$castWomenAmount)
lillie.test(movies_num$castWomenAmount)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$castWomenAmount
D = 0.52228, p-value < 2.2e-16
# castMenAmount: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$castMenAmount)
boxplot(movies_num$castMenAmount)
lillie.test(movies_num$castMenAmount)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$castMenAmount
D = 0.52698, p-value < 2.2e-16
# releaseYear: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$releaseYear)
boxplot(movies_num$releaseYear)
lillie.test(movies_num$releaseYear)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$releaseYear
D = 0.23746, p-value < 2.2e-16
# popularity: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$popularity)
boxplot(movies_num$popularity)
lillie.test(movies_num$popularity)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$popularity
D = 0.43322, p-value < 2.2e-16
# voteCount: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$voteCount)
boxplot(movies_num$voteCount)
lillie.test(movies_num$voteCount)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$voteCount
D = 0.36364, p-value < 2.2e-16
# voteAvg: histogram, box plot and Lilliefors (Kolmogorov-Smirnov) normality test.
hist(movies_num$voteAvg)
boxplot(movies_num$voteAvg)
lillie.test(movies_num$voteAvg)
Lilliefors (Kolmogorov-Smirnov) normality test
data: movies_num$voteAvg
D = 0.28937, p-value < 2.2e-16
# Frequency table: number of movies for each value of originalLanguage.
table(movies$originalLanguage)
ab af am ar as az be bg bn bs ca cn cs
1 4 15 106 2 17 1 5 45 6 36 100 36
cy da de dv el en es et eu fa fi fr ga
2 94 461 3 39 11961 1238 36 3 70 45 1094 4
gl gu he hi hr ht hu hy id ig is it ja
3 6 17 100 17 1 29 6 173 1 7 302 868
jv ka kk km kn ko ku ky la lb lt lv mk
5 12 11 14 30 336 10 6 1 2 34 24 4
ml mn mo mr ms mt my nb ne nl no or pa
59 2 1 20 17 2 5 7 5 192 54 2 7
pl pt qu ro ru rw se si sk sl sn so sq
80 628 1 25 190 2 3 3 8 9 1 1 10
sr sv sw ta te th tl tr uk ur uz vi xh
23 133 1 74 57 59 110 106 43 8 4 29 1
xx zh zu
92 365 1
# Frequency table: number of movies for each value of the video column.
table(movies$video)
FALSE TRUE
19313 84
# Ten movies with the highest budget (title and budget), highest first.
orden_budget <- order(movies$budget, decreasing = TRUE)
top_budget <- movies[orden_budget, c("originalTitle", "budget")]
head(top_budget, n = 10)
# Ten movies with the highest revenue (title and revenue), highest first.
orden_revenue <- order(movies$revenue, decreasing = TRUE)
top_revenue <- movies[orden_revenue, c("originalTitle", "revenue")]
head(top_revenue, n = 10)
# Movie with the largest vote count (first such row when there are ties).
fila_mas_votos <- which.max(movies$voteCount)
mas_votos <- movies[fila_mas_votos, c("originalTitle", "voteCount")]
print(mas_votos)
originalTitle voteCount
13402 Inception 30788
# Movie with the lowest average vote (first such row when there are ties).
# NOTE(review): the summaries of voteAvg and voteCount suggest many movies have an
# average of 0 simply because they have no votes; consider restricting to
# voteCount > 0 before calling the result the "worst" movie.
fila_peor <- which.min(movies$voteAvg)
peor_pelicula <- movies[fila_peor, c("originalTitle", "voteAvg")]
print(peor_pelicula)
originalTitle voteAvg
1 غوطه ور 0
# How many movies were released in each year?
peliculas_por_anio <- table(movies$releaseYear)
df_peliculas_anio <- setNames(
  as.data.frame(peliculas_por_anio),
  c("Anio", "Cantidad")
)
# Which year had the most movies?
fila_max <- which.max(df_peliculas_anio$Cantidad)
anio_mas_productivo <- df_peliculas_anio[fila_max, ]
mensaje <- paste(
  "El año con más películas fue:", anio_mas_productivo$Anio,
  "con", anio_mas_productivo$Cantidad, "películas."
)
print(mensaje)
[1] "El año con más películas fue: 2025 con 7351 películas."
# Bar chart of the number of movies per release year.
# Convert the year labels to numbers (via character) so the x axis is numeric.
df_peliculas_anio$AnioNum <- as.numeric(as.character(df_peliculas_anio$Anio))
ggplot(data = df_peliculas_anio, mapping = aes(x = AnioNum, y = Cantidad)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Cantidad de Películas por Año",
    x = "Año de Lanzamiento",
    y = "Número de Películas"
  ) +
  theme_minimal()
# Parse the release dates and list the 20 most recently released movies.
# NOTE(review): as.Date() is called without an explicit format; confirm the
# releaseDate strings are in a format it recognizes by default.
movies[["releaseDate"]] <- as.Date(movies[["releaseDate"]])
orden_fecha <- order(movies$releaseDate, decreasing = TRUE)
peliculas_recientes <- movies[orden_fecha, c("originalTitle", "releaseDate")]
v20_peliculas_recientes <- head(peliculas_recientes, n = 20)
v20_peliculas_recientes